library(curatedOvarianData)
## Loading required package: affy
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
data(package="curatedOvarianData")
The TCGA_eset dataset contains gene expression data for 13,104 features (genes) across 578 samples. The data has been preprocessed and normalized using a standardized pipeline to remove batch effects and other sources of technical variability. The resulting dataset contains 20,501 genes and is stored as an ExpressionSet object in R, which includes not only the gene expression data but also sample annotation information and other metadata.
data(TCGA_eset)
# Get gene names
gene_names <- featureNames(TCGA_eset)
head(gene_names, 10)
## [1] "A1CF" "A2M" "A4GNT" "AAAS" "AACS" "AADAC" "AAGAB" "AAK1" "AAMDC"
## [10] "AAMP"
we can access this information using the pData() and fData() functions from the Biobase package.
#summary(TCGA_eset)
#dim(TCGA_eset)
library(Biobase)
sample_metadata <- pData(TCGA_eset)
#head(sample_metadata)
feature_metadata <- fData(TCGA_eset)
#head(feature_metadata)
library(curatedOvarianData)
library(pheatmap)
# Extract gene expression data
exprs_matrix <- exprs(TCGA_eset)
# Set row names to gene names
rownames(exprs_matrix) <- featureNames(TCGA_eset)
# Subset to top 20 most variable genes by custom filter
exprs_matrix <- exprs_matrix[apply(exprs_matrix, 1, function(x) var(x) > 0.5), ][20:40, ]
# Create heatmap
pheatmap(exprs_matrix, scale = "row", clustering_distance_rows = "euclidean", show_rownames = T, show_colnames=F)
### The plot shows a heatmap of the expression levels of the top 20 most
variable genes (determined by a custom filter) in a subset of samples
from the TCGA ovarian cancer dataset. Each row represents a gene, and
each column represents a sample. The color scale represents the level of
expression for each gene in each sample, with red indicating high
expression and blue indicating low expression. The rows are clustered
using euclidean distance and the columns are not clustered
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.14.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
## in the original pheatmap() are identically supported in the new function. You
## can still use the original function by explicitly calling pheatmap::pheatmap().
##
## Attaching package: 'ComplexHeatmap'
## The following object is masked from 'package:pheatmap':
##
## pheatmap
top_gene_expression <- exprs(TCGA_eset)
sample_metadata <- pData(TCGA_eset)
top_gene_expression_t <- t(top_gene_expression)
combined_data <- data.frame(Sample_Type = sample_metadata$sample_type, top_gene_expression_t)
#non_na_indices contain the indices of all samples that have a non-missing value
non_na_indices <- which(!is.na(sample_metadata$sample_type))
# we create a new data frame with the Sample_Type column, using only the rows corresponding to the non_na_indices.
sample_annotation_filtered <- data.frame(Sample_Type = sample_metadata$sample_type[non_na_indices])
rownames(sample_annotation_filtered) <- rownames(sample_metadata)[non_na_indices]
# filter the columns of the top_gene_expression matrix to include only those that correspond to the non_na_indices
top_gene_expression_filtered <- top_gene_expression[, non_na_indices]
col_fun <- colorRampPalette(c("blue", "white", "red"))(100)
Heatmap(top_gene_expression_filtered,
col = col_fun,
name = "Expression",
cluster_rows = TRUE,
cluster_columns = TRUE,
show_column_names = FALSE,
show_row_names = FALSE,
top_annotation = HeatmapAnnotation(df = sample_annotation_filtered$Sample_Type))
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
### The heatmap is showing the gene expression levels for the top
variable genes in the dataset, with rows representing genes and columns
representing individual samples. The color scale represents the relative
expression levels of the genes, with red indicating high expression and
blue indicating low expression. The heatmap can be used to identify
patterns of gene expression that distinguish between tumor and healthy
samples.
library(corrplot)
## corrplot 0.92 loaded
gene_expression <- exprs(TCGA_eset)
# Select a subset of genes (e.g., the first 20 genes)
selected_genes <- gene_expression[1:20, ]
# Select a subset of samples (e.g., the first 100 samples)
selected_samples <- selected_genes[, 1:100]
correlation_matrix_subset <- cor(t(selected_samples), method = "pearson")
corrplot(correlation_matrix_subset, method = "color", type = "upper", mar = c(0, 0, 1, 1))
### This plot is showing the correlation between the expression levels
of a subset of genes (the first 20 genes) in a subset of samples (the
first 100 samples) from the TCGA_eset dataset. The plot is a correlation
matrix, where each cell represents the correlation between the
expression levels of two genes. The color of each cell indicates the
strength of the correlation, with darker colors indicating stronger
positive or negative correlations.
library(corrplot)
gene_expression <- exprs(TCGA_eset)
# Select a subset of genes (e.g., the first 100 genes)
selected_genes <- gene_expression[1:100, ]
# Select a subset of samples (e.g., the first 100 samples)
selected_samples <- selected_genes[, 1:100]
correlation_matrix_subset <- cor(t(selected_samples), method = "pearson")
corrplot(correlation_matrix_subset, method = "color", type = "upper", mar = c(0, 0, 1, 1),tl.pos="n")
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:BiocGenerics':
##
## normalize, path, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Load the TCGA_eset dataset
data(TCGA_eset)
# Extract the gene expression data
tcga.exprs <- exprs(TCGA_eset)
# Subset the gene expression data to the first 30 samples
tcga.exprs <- tcga.exprs[, 1:30]
# Log2 transformation of the gene expression data
log2.exprs <- log2(tcga.exprs + 1)
# Calculate the variance for each gene
gene.variance <- apply(log2.exprs, 1, var)
# Order genes by variance and select the top 100
top.genes <- head(order(gene.variance, decreasing = TRUE), 100)
# Subset the gene expression data with the top 50 genes
subset.exprs <- log2.exprs[top.genes, ]
# Generate a correlation matrix of the gene expression data
cor.matrix <- cor(subset.exprs)
# Create an igraph graph object
graph <- graph.adjacency(cor.matrix, mode = "undirected", weighted = TRUE)
# Set node color based on gene expression level
node.color <- ifelse(rowMeans(subset.exprs) > log2(6), "red", "blue")
# Plot the graph with node color based on gene expression level
plot(graph, vertex.size =8, vertex.color = node.color,vertex.label.cex = 0.7, edge.width = E(graph)$weight*2)
## Warning in v(graph): Non-positive edge weight found, ignoring all weights
## during graph layout.
The node color is determined by the average gene expression level. If the mean expression level is greater than log2(6), the node is colored red, otherwise, it’s colored blue. This plot can help visualize and explore the relationships between the most variably expressed genes in the dataset. It can provide insights into the co-expression patterns, which can be useful for understanding gene function, regulation, and interactions. However, it’s essential to keep in mind that correlation does not imply causation, and further analyses or experimental validation may be necessary to confirm the relationships between the genes.
library(affy)
library(circlize)
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
##
## Attaching package: 'circlize'
## The following object is masked from 'package:igraph':
##
## degree
# Load the TCGA_eset dataset
data(TCGA_eset)
# Extract gene expression data
gene_expression <- exprs(TCGA_eset)
# Calculate variance for each gene
gene_variance <- apply(gene_expression, 1, var)
# Get the indices of the top 6 genes with the highest variance
top_genes <- order(gene_variance, decreasing = TRUE)[10:15]
# Randomly select 6 samples
selected_samples <-15:20
# Subset the gene expression data with the top 5 genes and the selected 10 samples
subset_exprs <- gene_expression[top_genes, selected_samples]
# Create the chord diagram
chordDiagram(subset_exprs)
# Set the font size of the labels
circos.text(0, seq(0, 1, by = 1/length(top_genes)), top_genes,
facing =
"outside", niceFacing = T, cex = 0.001, adj = c(-0.05, 0.05))
# Load necessary libraries
library(GEOquery)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(networkD3)
# Download the GSE113690 dataset
gse <- getGEO("GSE113690")
## Found 1 file(s)
## GSE113690_series_matrix.txt.gz
eset <- gse[[1]]
# Extract gene expression data
exprs <- exprs(eset)
# Calculate variance for each gene
gene_variance <- apply(exprs, 1, var)
# Get the indices of the top 20 genes with the highest variance
top_genes <- head(order(gene_variance, decreasing = TRUE), 20)
# Randomly select 20 samples
set.seed(1)
selected_samples <- sample(ncol(exprs), 20)
# Subset the gene expression data with the top 20 genes and the selected 20 samples
subset_exprs <- exprs[top_genes, selected_samples]
# Calculate correlation matrix
cor_mat <- cor(subset_exprs)
# Convert correlation matrix to edge list
edge_list <- as.data.frame(as.table(cor_mat))
colnames(edge_list) <- c("source", "target", "weight")
# Define node colors and sizes
# Create the simple network
simpleNetwork(edge_list,linkColour = "blue")
library(visNetwork)
library(visNetwork)
set.seed(1)
# Create a data frame with nodes and edges
nodes <- data.frame(id = 1:10,
label = paste0("Node ", 1:10),
value = sample(1:10, 10, replace = TRUE),
color = c("red", "blue", "green", "orange", "purple",
"yellow", "pink", "brown", "grey", "black"))
edges <- data.frame(from = sample(1:10, 20, replace = TRUE),
to = sample(1:10, 20, replace = TRUE))
# Create the network visualization
visNetwork(nodes, edges) %>%
visOptions(highlightNearest = TRUE,
nodesIdSelection = TRUE) %>%
visLayout(randomSeed = 123)
library(nycflights13)
library(visNetwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:igraph':
##
## as_data_frame, groups, union
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
flights_subset <- flights %>%
group_by(origin) %>%
sample_n(size = 10)
# create nodes and edges data frames
nodes <- data.frame(id = unique(c(flights_subset$origin, flights_subset$dest)),
label = unique(c(flights_subset$origin, flights_subset$dest)))
edges <- data.frame(from = flights_subset$origin, to = flights_subset$dest)
# create the network visualization
visNetwork(nodes, edges, width = "100%") %>%
visNodes(shape = "dot", size = 10) %>%
visEdges(arrows = "to", smooth = TRUE) %>%
visLayout(randomSeed = 123)
library(DiagrammeR)
##
## Attaching package: 'DiagrammeR'
## The following object is masked from 'package:igraph':
##
## count_automorphisms
grViz("
digraph {
# Set node defaults
node [shape = ellipse, style = filled, color = skyblue, fontname = 'Helvetica']
# Add nodes to the graph
A [label = 'Gene A']
B [label = 'Gene B']
C [label = 'Gene C']
D [label = 'Gene D']
E [label = 'Gene E']
F [label = 'Gene F']
G [label = 'Gene G']
H [label = 'Gene H']
I [label = 'Gene I']
J [label = 'Gene J']
K [label = 'Gene K']
L [label = 'Gene L']
M [label = 'Gene M']
N [label = 'Gene N']
# Set edge defaults
edge [color = red]
# Add edges to the graph
A -> B
A -> C
B -> D
C -> D
C -> E
D -> F
E -> F
F -> G
F -> H
G -> I
H -> I
I -> J
I -> K
J -> L
K -> L
L -> M
L -> N
}
")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.2 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%() masks igraph::%--%()
## ✖ tibble::as_data_frame() masks dplyr::as_data_frame(), igraph::as_data_frame()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ purrr::compose() masks igraph::compose()
## ✖ tidyr::crossing() masks igraph::crossing()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::pm() masks affy::pm()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::simplify() masks igraph::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Generate mock data
set.seed(123)
metabolites <- paste0("Metabolite", 1:20)
n_endpoints <- sample(1:20, 100, replace = TRUE)
rank <- rank(-n_endpoints)
data <- data.frame(Metabolite = metabolites, N_adverse_events = n_endpoints, Rank = rank)
# Create the brick plot
ggplot(data, aes(x = Rank, y = N_adverse_events, fill = N_adverse_events)) +
geom_tile() +
geom_text(aes(label = Metabolite), size = 2, color = "white") +
scale_fill_gradient(low = "#FDE725", high = "#440154") +
coord_flip() +
xlab("Rank") +
ylab("Number of associated adverse events") +
theme_bw()
library(tidyverse)
# Load the airquality dataset
data(airquality)
# Create a summary table of the number of missing values for each variable
missing_values <- airquality %>%
summarize_all(~sum(is.na(.))) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") %>%
arrange(desc(N_Missing))
# Create the brick plot
ggplot(missing_values, aes(x = rank(-N_Missing), y = N_Missing, fill = N_Missing)) +
geom_tile() +
geom_text(aes(label = Variable), size = 3, color = "white") +
scale_fill_gradient(low = "#FDE725", high = "#440154") +
coord_flip() +
xlab("Rank") +
ylab("Number of missing values") +
theme_bw()
options(digits = 10)
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
# Create a design matrix using your sample group information
design <- model.matrix(~ 0 + sample_metadata$sample_type)
# the linear model is fit to the gene expression data using the sample metadata information to define groups
fit <- lmFit(TCGA_eset, design)
# empirical Bayes statistics are applied to obtain moderated t-statistics for each gene.
#The moderated t-statistics are used to rank the genes based on differential expression between groups
fit <- eBayes(fit)
#summary(fit)
#topTable function is to extract the top genes based on this ranking
#adjust parameter is set to "fdr" to adjust the p-values for multiple testing using the false discovery rate method.
#significance threshold is defined to identify the differentially expressed genes based on the adjusted p-value
results <- topTable(fit, number = nrow(TCGA_eset), adjust = "fdr", sort.by = "none")
head(results)
## probeset gene sample_metadata.sample_typehealthy
## A1CF 220951_s_at A1CF 3.003843623
## A2M 217757_at A2M 9.535445875
## A4GNT 221131_at A4GNT 3.257179259
## AAAS 218075_at AAAS 4.893685168
## AACS 218434_s_at AACS 6.640660185
## AADAC 205969_at AADAC 3.996337787
## sample_metadata.sample_typetumor AveExpr F P.Value
## A1CF 3.111787496 3.110293463 75617.201841 0
## A2M 8.792475255 8.802758585 27378.102036 0
## A4GNT 3.412591158 3.410440129 111434.329488 0
## AAAS 5.128383994 5.125135568 54033.481844 0
## AACS 6.910025674 6.906297432 41476.720695 0
## AADAC 4.902555544 4.890012738 6068.648534 0
## adj.P.Val
## A1CF 0
## A2M 0
## A4GNT 0
## AAAS 0
## AACS 0
## AADAC 0
options(digits = 4)
library(ggplot2)
# Prepare the data for the volcano plot
volcano_data <- data.frame(
genes = results$gene,
log2FoldChange = results$sample_metadata.sample_typetumor - results$sample_metadata.sample_typehealthy,
pvalue = results$adj.P.Val,
adj_pvalue = -log10(results$adj.P.Val) * sign(results$sample_metadata.sample_typetumor - results$sample_metadata.sample_typehealthy)
)
# Create the volcano plot
ggplot(volcano_data, aes(x = log2FoldChange, y = adj_pvalue)) +
geom_point(aes(color = adj_pvalue > 0.05), alpha = 0.7) +
scale_color_manual(values = c("blue", "red")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
labs(x = "Log2 Fold Change", y = "-log10 adjusted p-value", title = "Differentially Expressed Genes") +
theme_classic()
library(dygraphs)
data(co2)
dygraph(co2, main = "Atmospheric CO2 Concentrations") %>%
dySeries("V1") %>% #specify which series (column) in the data frame to plot in the dygraph()
dyRangeSelector()
library(mixOmics)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: lattice
##
## Loaded mixOmics 6.22.0
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us: citation('mixOmics')
##
## Attaching package: 'mixOmics'
## The following object is masked from 'package:purrr':
##
## map
set.seed(42)
selected_samples <- sample(colnames(TCGA_eset), 100)
# Subset the data
TCGA_eset_subset <- TCGA_eset[, selected_samples]
# PCA plot
pca_res <- pca(t(exprs(TCGA_eset_subset)), ncomp = 2)
plotIndiv(pca_res)
library(rbokeh)
# Create sample data
x <- rnorm(100)
y <- rnorm(100)
label <- paste("Point", 1:100 )
# Create plot
p <- figure() %>%
ly_points(x, y, hover = list(label),color="purple")
# Show plot
p
#install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:ComplexHeatmap':
##
## add_heatmap
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Select two genes randomly
genes <- sample(rownames(exprs(TCGA_eset)), 2)
# Create a data frame with the gene expression values
plot_data <- data.frame(x = exprs(TCGA_eset)[genes[1], ],
y = exprs(TCGA_eset)[genes[2], ])
# Create plot
plot_ly(plot_data, x = ~x, y = ~y, type = "scatter", mode = "markers")
# M values (log fold change)
logFC <- rnorm(nrow(TCGA_eset), mean = 0, sd = 1)
#A values (mean expression)
A_value <- rowMeans(exprs(TCGA_eset))
M_value <- logFC
ma_data <- data.frame(Gene = rownames(TCGA_eset),
A_value = A_value,
M_value = M_value)
ggplot(ma_data, aes(x = A_value, y = M_value)) +
geom_point(alpha = 0.5) +
xlab("A - Mean Expression") +
ylab("M - log Fold Change") +
theme_bw()
the x-axis showing the average expression level (A value) and the y-axis showing the log fold change (M value) between two conditions or groups
library(rgl)
library(plot3D)
library(plotrix)
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:rgl':
##
## mtext3d
library(Rcmdr)
## Loading required package: splines
## Loading required package: RcmdrMisc
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: sandwich
## Loading required package: effects
## Use the command
## lattice::trellis.par.set(effectsTheme())
## to customize lattice options for effects plots.
## See ?effectsTheme for details.
## The Commander GUI is launched only in interactive sessions
##
## Attaching package: 'Rcmdr'
## The following object is masked from 'package:base':
##
## errorCondition
# Create sample data
x <- rnorm(100)
y <- rnorm(100)
z <- rnorm(100)
# Create 3D scatterplot
scatter3d(x, y, z, main = "3D Scatterplot", xlab = "X", ylab = "Y", zlab = "Z")
## Loading required namespace: mgcv
## Warning in par3d(userMatrix = structure(c(1, 0, 0, 0, 0, 0.342020143325668, :
## font family "sans" not found, using "bitmap"